title

Identification of the T. brucei

Wellcome Centre for Anti-Infectives Research
School of Life Sciences, University of Dundee
In [1]:
#reload when modified
%load_ext autoreload
%autoreload 2
In [4]:
#set up code
import sys
sys.path.append('../../ProLib/')
from ProteomicsUtility import utilities as PTUT
import ProtRank
import warnings
warnings.filterwarnings("ignore")
#define helphttp://localhost:8888/notebooks/calvin/new_data/analysis_427_2018.ipynb#ing function
import os
from tqdm import tqdm_notebook
from sklearn.decomposition import PCA
from sklearn.manifold import MDS
from adjustText import adjust_text
from matplotlib.lines import Line2D
from Bio import SeqIO
import pandas as pd
import numpy as np
from scipy.stats import ttest_ind
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('ggplot')
In [5]:
#E:\old_ege_nucleus\nuc_txt\txt
RAW_FILE_PATH =os.path.join('')
PREFIX = 'combined'
#PREFIX = 'combined_add_esag_unique'
TXT_PATH=os.path.join(RAW_FILE_PATH, PREFIX, 'txt')
In [6]:
OUT_FOLDER = os.path.join(TXT_PATH, 'results')
In [7]:
import os
if not os.path.exists(OUT_FOLDER):
    os.makedirs(OUT_FOLDER)
    print('created out folder')
else:
    print('out folder exist')
created out folder

Load data

In [8]:
#pd.DataFrame.sort_values??
In [10]:
#read data and log transform for plots
df = pd.read_csv(os.path.join(TXT_PATH,'proteinGroups.txt'),sep='\t')
df = PTUT.clean_df(df, score=5, unique_pep_threshold=1)
df = PTUT.mod_df(df)
print(df.shape)
starting from: (6535, 63)
removed  71 Protein Groups by: Only identified by site
tot  71  entries removed
---------------
removed  64 Protein Groups by: Reverse
tot  135  entries removed
---------------
removed  88 Protein Groups by: Potential contaminant
tot  223  entries removed
---------------
removed  571 Protein Groups by: Score
tot  794  entries removed
---------------
Peptide counts (razor+unique)
nothing removed
---------------
(5741, 66)
In [11]:
df['pep_stats']=[int(n.split(';')[0]) for n in df['Peptide counts (all)']]
In [12]:
df['pep_stats'].describe()
Out[12]:
count    5741.000000
mean       13.817453
std        14.397654
min         1.000000
25%         6.000000
50%        10.000000
75%        17.000000
max       258.000000
Name: pep_stats, dtype: float64
In [13]:
desc_927= dict(zip(df['Gene_id'],df['desc']))
In [14]:
list(df.columns)[0:10]
Out[14]:
['Protein IDs',
 'Majority protein IDs',
 'Peptide counts (all)',
 'Peptide counts (razor+unique)',
 'Peptide counts (unique)',
 'Fasta headers',
 'Number of proteins',
 'Peptides',
 'Razor + unique peptides',
 'Unique peptides']
In [16]:
list(df.columns)
Out[16]:
['Protein IDs',
 'Majority protein IDs',
 'Peptide counts (all)',
 'Peptide counts (razor+unique)',
 'Peptide counts (unique)',
 'Fasta headers',
 'Number of proteins',
 'Peptides',
 'Razor + unique peptides',
 'Unique peptides',
 'Sequence coverage [%]',
 'Unique + razor sequence coverage [%]',
 'Unique sequence coverage [%]',
 'Mol. weight [kDa]',
 'Sequence length',
 'Sequence lengths',
 'Q-value',
 'Score',
 'Reporter intensity corrected 1',
 'Reporter intensity corrected 2',
 'Reporter intensity corrected 3',
 'Reporter intensity corrected 4',
 'Reporter intensity corrected 5',
 'Reporter intensity corrected 6',
 'Reporter intensity corrected 7',
 'Reporter intensity corrected 8',
 'Reporter intensity corrected 9',
 'Reporter intensity corrected 10',
 'Reporter intensity 1',
 'Reporter intensity 2',
 'Reporter intensity 3',
 'Reporter intensity 4',
 'Reporter intensity 5',
 'Reporter intensity 6',
 'Reporter intensity 7',
 'Reporter intensity 8',
 'Reporter intensity 9',
 'Reporter intensity 10',
 'Reporter intensity count 1',
 'Reporter intensity count 2',
 'Reporter intensity count 3',
 'Reporter intensity count 4',
 'Reporter intensity count 5',
 'Reporter intensity count 6',
 'Reporter intensity count 7',
 'Reporter intensity count 8',
 'Reporter intensity count 9',
 'Reporter intensity count 10',
 'Intensity',
 'MS/MS count',
 'Only identified by site',
 'Reverse',
 'Potential contaminant',
 'id',
 'Peptide IDs',
 'Peptide is razor',
 'Mod. peptide IDs',
 'Evidence IDs',
 'MS/MS IDs',
 'Best MS/MS',
 'Oxidation (M) site IDs',
 'Oxidation (M) site positions',
 'Taxonomy IDs',
 'unique_int',
 'Gene_id',
 'desc',
 'pep_stats']
In [17]:
wt_headers = [
 'Reporter intensity corrected 7',
 'Reporter intensity corrected 8',
 'Reporter intensity corrected 9',]

plus_headers = [
 'Reporter intensity corrected 4',
 'Reporter intensity corrected 5',
 'Reporter intensity corrected 6']

minus_headers =[
 'Reporter intensity corrected 1',
 'Reporter intensity corrected 2',
 'Reporter intensity corrected 3']
In [18]:
#list(df.columns)
In [19]:
cols = wt_headers+plus_headers+minus_headers
selection = df[cols]
selection.head()
Out[19]:
Reporter intensity corrected 7 Reporter intensity corrected 8 Reporter intensity corrected 9 Reporter intensity corrected 4 Reporter intensity corrected 5 Reporter intensity corrected 6 Reporter intensity corrected 1 Reporter intensity corrected 2 Reporter intensity corrected 3
194 138240.0 157210.0 137240.0 175120.0 173400.0 176710.0 168770.0 178260.0 166710.0
195 1083700.0 1218100.0 1102700.0 1344500.0 1334100.0 1318900.0 1424000.0 1467700.0 1358100.0
196 540130.0 604110.0 556550.0 586730.0 560580.0 571870.0 557320.0 588210.0 540260.0
197 1065400.0 1189700.0 1100500.0 1244600.0 1257100.0 1226500.0 1233200.0 1259100.0 1176500.0
198 441310.0 505710.0 445210.0 535490.0 522490.0 565460.0 553280.0 575600.0 544740.0
In [20]:
print(selection.shape)
selection = selection[(selection.T != 0).any()]
print(selection.shape)
(5741, 9)
(5738, 9)
In [21]:
selection.describe()
Out[21]:
Reporter intensity corrected 7 Reporter intensity corrected 8 Reporter intensity corrected 9 Reporter intensity corrected 4 Reporter intensity corrected 5 Reporter intensity corrected 6 Reporter intensity corrected 1 Reporter intensity corrected 2 Reporter intensity corrected 3
count 5.738000e+03 5.738000e+03 5.738000e+03 5.738000e+03 5.738000e+03 5.738000e+03 5.738000e+03 5.738000e+03 5.738000e+03
mean 3.330544e+05 3.794479e+05 3.473265e+05 3.822329e+05 3.775751e+05 3.779895e+05 3.663564e+05 3.831168e+05 3.523493e+05
std 7.058981e+05 8.283221e+05 7.361671e+05 7.619829e+05 7.486934e+05 7.611123e+05 7.570838e+05 7.989871e+05 7.301619e+05
min 2.730600e+02 2.253200e+02 2.104500e+02 4.667900e+02 4.225600e+02 4.390800e+02 0.000000e+00 0.000000e+00 3.669500e+02
25% 7.637275e+04 8.646750e+04 8.055525e+04 8.968650e+04 8.865275e+04 8.900600e+04 8.013850e+04 8.447650e+04 7.774200e+04
50% 1.689800e+05 1.921800e+05 1.781850e+05 1.993850e+05 1.981850e+05 1.979200e+05 1.877650e+05 1.954450e+05 1.802850e+05
75% 3.695525e+05 4.210300e+05 3.867025e+05 4.339400e+05 4.290275e+05 4.280525e+05 4.134975e+05 4.302250e+05 3.973400e+05
max 2.959500e+07 3.607500e+07 3.111600e+07 3.114700e+07 3.029300e+07 3.152300e+07 3.223500e+07 3.449300e+07 3.118700e+07
In [23]:
new_dm_columns = ['WT'+str(n+1) for n in range(3)]
new_plus_columns= ['Plus'+str(n+1) for n in range(3)]
new_minus_columns= ['Minus'+str(n+1) for n in range(3)]
selection.columns = new_dm_columns+new_plus_columns+new_minus_columns

Make ProtRank input

In [24]:
temp = pd.concat([df[['Gene_id']], selection],axis=1)
temp = temp[(temp.iloc[:,1:].T != 0).any()]
temp.to_csv(os.path.join(OUT_FOLDER,'indata_prank.txt'),sep='\t',index=False)

Missing values

In [ ]:
 
In [25]:
import missingno as msno
#visualization of missing data
ax=msno.bar(selection.replace(0,np.nan),figsize=(6, 6))
plt.title('Missing Data Analysis', size=12)
ax.set_ylabel('Fraction of data points',size=12)
plt.savefig(os.path.join(OUT_FOLDER,'missing_value_bar.png'))
plt.tight_layout()
plt.show()
In [26]:
#print(data.shape)
msno.matrix(selection.replace(0,np.nan), figsize=(6, 6))
#plt.title('Missing Data')
plt.savefig(os.path.join(OUT_FOLDER,'missing_value_matrix.png'))
plt.show()

Value distibution

Denisty

In [27]:
#pal = sns.color_palette()
In [41]:
palette=['r','r','r','g','g','g','b','b','b']
palette_g = ['r','g','b']
color_dictionary = {'r':'WT','g':'Plus','b':'Minus'}
In [42]:
np.log10(selection).head()
Out[42]:
WT1 WT2 WT3 Plus1 Plus2 Plus3 Minus1 Minus2 Minus3
194 5.140634 5.196480 5.137481 5.243336 5.239049 5.247261 5.227295 5.251054 5.221962
195 6.034909 6.085683 6.042457 6.128561 6.125188 6.120212 6.153510 6.166637 6.132932
196 5.732498 5.781116 5.745504 5.768438 5.748638 5.757297 5.746105 5.769532 5.732603
197 6.027513 6.075437 6.041590 6.095030 6.099370 6.088668 6.091034 6.100060 6.070592
198 5.644744 5.703902 5.648565 5.728751 5.718078 5.752402 5.742945 5.760121 5.736189
In [ ]:
 
In [43]:
np.log10(selection+1).plot(kind='kde', color=palette, alpha=0.5)
plt.legend(loc='center left', bbox_to_anchor=(1, 0.5))
plt.title('Value Distribution')
plt.xlabel('Log10 TMT')
plt.savefig(os.path.join(OUT_FOLDER,'value_distribution_kde.png'))
plt.show()

Box plot

In [44]:
sns.boxplot(data =np.log10(selection+1),showfliers=False,palette=palette)
plt.title('Value Distribution')
plt.xlabel('Log10 LFQ')
plt.savefig(os.path.join(OUT_FOLDER,'value_distribution_box.png'))
plt.show()

Replication and reproducibility

Heatmaps

replace zeros with small values

In [45]:
import seaborn as sns
f, ax = plt.subplots(figsize=(9, 6))
sns.heatmap(np.log1p(selection).dropna().corr(), 
            annot=True, linewidths=.5, ax=ax,cmap="Blues")
plt.savefig(os.path.join(OUT_FOLDER,'corr_heatmap.png'))
plt.show()

remove any zeros

Scatter Plots

In [35]:
#utilities.plot_correlation(.sample(500))
#fig,ax=plt.subplots(figsize=(8,8))
g = sns.pairplot(np.log1p(selection).dropna().sample(500))
plt.savefig(os.path.join(OUT_FOLDER,'corr_pairplot.png'))
plt.show()

coefficient of variation

In [46]:
con_columns = ['WT'+str(n+1) for n in range(3)]
ind_columns= ['Plus'+str(n+1) for n in range(3)]
ind_r_columns = ['Minus'+str(n+1) for n in range(3)]

import seaborn as sns
cv = selection.copy()
cv = cv.dropna()
cv['mean_c'] = cv[con_columns].mean(axis=1)
cv['std_c'] = cv[con_columns].std(axis=1)

cv['mean_i'] = cv[ind_columns].mean(axis=1)
cv['std_i'] = cv[ind_columns].std(axis=1)


cv['mean_ir'] = cv[ind_columns].mean(axis=1)
cv['std_ir'] = cv[ind_columns].std(axis=1)

cv['cv_DM'] = cv['std_c']/cv['mean_c']
cv['cv_Plus'] = cv['std_i']/cv['mean_i']
cv['cv_Minus'] = cv['std_ir']/cv['mean_ir']
fig,ax=plt.subplots(figsize=(8,6))
sns.boxplot(data=cv[['cv_DM','cv_Plus','cv_Minus']],palette=palette_g,
            showfliers=False,ax=ax)
#plt.savefig(os.path.join('Fig_S3A_cv.png'))
plt.savefig(os.path.join(OUT_FOLDER,'cv.png'))
plt.show()
In [47]:
cv.plot(x='mean_c',y='cv_DM',kind='scatter',marker='.')
plt.xscale('log')
plt.show()
cv.plot(x='mean_i',y='cv_Minus',kind='scatter',marker='.')
plt.xscale('log')
plt.show()
cv.plot(x='mean_ir',y='cv_Plus',kind='scatter',marker='.')
plt.xscale('log')
plt.show()

Dimensionality reduction

MSD

In [48]:
color_dictionary
Out[48]:
{'r': 'WT', 'g': 'Plus', 'b': 'Minus'}
In [49]:
fig, ax = plt.subplots()

ax = PTUT.make_mds(selection, palette, ax, top=500,
                   color_dictionary=color_dictionary)
plt.savefig(os.path.join(OUT_FOLDER,'mds.png')) 
{'r': 'WT', 'g': 'Plus', 'b': 'Minus'}

PCA

In [50]:
fig, ax = plt.subplots()
ax=PTUT.make_pca(selection, palette, ax, top=500,
                  color_dictionary=color_dictionary)
plt.savefig(os.path.join(OUT_FOLDER,'pca.png')) 
[0.99439722 0.00477104]
{'r': 'WT', 'g': 'Plus', 'b': 'Minus'}

make polystest out

In [51]:
selection.describe()
Out[51]:
WT1 WT2 WT3 Plus1 Plus2 Plus3 Minus1 Minus2 Minus3
count 5.738000e+03 5.738000e+03 5.738000e+03 5.738000e+03 5.738000e+03 5.738000e+03 5.738000e+03 5.738000e+03 5.738000e+03
mean 3.330544e+05 3.794479e+05 3.473265e+05 3.822329e+05 3.775751e+05 3.779895e+05 3.663564e+05 3.831168e+05 3.523493e+05
std 7.058981e+05 8.283221e+05 7.361671e+05 7.619829e+05 7.486934e+05 7.611123e+05 7.570838e+05 7.989871e+05 7.301619e+05
min 2.730600e+02 2.253200e+02 2.104500e+02 4.667900e+02 4.225600e+02 4.390800e+02 0.000000e+00 0.000000e+00 3.669500e+02
25% 7.637275e+04 8.646750e+04 8.055525e+04 8.968650e+04 8.865275e+04 8.900600e+04 8.013850e+04 8.447650e+04 7.774200e+04
50% 1.689800e+05 1.921800e+05 1.781850e+05 1.993850e+05 1.981850e+05 1.979200e+05 1.877650e+05 1.954450e+05 1.802850e+05
75% 3.695525e+05 4.210300e+05 3.867025e+05 4.339400e+05 4.290275e+05 4.280525e+05 4.134975e+05 4.302250e+05 3.973400e+05
max 2.959500e+07 3.607500e+07 3.111600e+07 3.114700e+07 3.029300e+07 3.152300e+07 3.223500e+07 3.449300e+07 3.118700e+07
In [52]:
#PTUT??
In [53]:
normed_df = PTUT.norm_loading_TMT(selection)
normed_df.head()
Out[53]:
WT1 WT2 WT3 Plus1 Plus2 Plus3 Minus1 Minus2 Minus3
194 1.521659e+05 1.518892e+05 1.448577e+05 1.679603e+05 1.683622e+05 1.713879e+05 1.688847e+05 1.705774e+05 1.734551e+05
195 1.192869e+06 1.176873e+06 1.163907e+06 1.289530e+06 1.295341e+06 1.279178e+06 1.424968e+06 1.404446e+06 1.413049e+06
196 5.945412e+05 5.836639e+05 5.874422e+05 5.627417e+05 5.442935e+05 5.546467e+05 5.576988e+05 5.628596e+05 5.621190e+05
197 1.172725e+06 1.149435e+06 1.161585e+06 1.193715e+06 1.220578e+06 1.189561e+06 1.234038e+06 1.204836e+06 1.224101e+06
198 4.857664e+05 4.885942e+05 4.699221e+05 5.135966e+05 5.073102e+05 5.484298e+05 5.536561e+05 5.507931e+05 5.667803e+05
In [55]:
fig, ax = plt.subplots()

ax = PTUT.make_mds(normed_df, palette, ax, top=500,
                   color_dictionary=color_dictionary)
plt.savefig(os.path.join(OUT_FOLDER,'mds.png')) 
{'r': 'WT', 'g': 'Plus', 'b': 'Minus'}
In [56]:
sns.boxplot(data =np.log10(normed_df),showfliers=False,palette=palette)
plt.title('Value Distribution')
plt.xlabel('Log10 LFQ')
plt.savefig(os.path.join(OUT_FOLDER,'value_distribution_box_normed.png'))
plt.show()
In [57]:
np.log10(normed_df).describe()
Out[57]:
WT1 WT2 WT3 Plus1 Plus2 Plus3 Minus1 Minus2 Minus3
count 5738.000000 5738.000000 5738.000000 5738.000000 5738.000000 5738.000000 5738.000000 5738.000000 5738.000000
mean 5.252579 5.251916 5.256251 5.260803 5.262743 5.263088 -inf -inf 5.247917
std 0.530424 0.530413 0.527557 0.528111 0.526560 0.524655 NaN NaN 0.540969
min 2.477942 2.337847 2.346610 2.650992 2.613084 2.629263 -inf -inf 2.581832
25% 4.924622 4.921900 4.929555 4.934598 4.934888 4.936138 4.904136 4.907604 4.907881
50% 5.269519 5.268755 5.274332 5.281563 5.284266 5.283209 5.273910 5.271892 5.273185
75% 5.609360 5.609360 5.610838 5.619300 5.619681 5.618216 5.616767 5.614563 5.616388
max 7.512902 7.542253 7.516445 7.475287 7.468538 7.485347 7.508623 7.518599 7.511199
In [58]:
normed_df.columns
Out[58]:
Index(['WT1', 'WT2', 'WT3', 'Plus1', 'Plus2', 'Plus3', 'Minus1', 'Minus2', 'Minus3'], dtype='object')
In [ ]:
 
In [59]:
OUT_FOLDER
Out[59]:
'combined/txt/results'
In [60]:
temp = df[['Gene_id','desc']].join(
    np.log10(normed_df),how='right').replace(-np.inf,np.nan)
temp.to_csv(
    os.path.join(OUT_FOLDER, 'indata_PolySTest.csv'),index=False)

Control Proteins

In [61]:
def plot_bar(ingene='Tb427_000014300.1-p1',log=False,title='ESAG7:Tb427_000014300'):
    temp = pd.read_csv(os.path.join(TXT_PATH,'proteinGroups.txt'),sep='\t')
    fig,ax=plt.subplots()
    s= temp[temp['Protein IDs'].str.contains(ingene)][cols]
    s.columns = con_columns+ind_columns+ind_r_columns
    if log:  
        np.log10(s).plot(kind='bar',color=palette,ax=ax)  
    else:
        s.plot(kind='bar',color=palette,ax=ax)
    ax.legend(title='Groups',loc='center left', bbox_to_anchor=(1, 0.9))
    plt.title(title)
    plt.savefig(os.path.join(OUT_FOLDER,ingene+'.png'))
    plt.show()
    #if ingene.replace('.1-p1','') in mapping_927_dict:
        #for n in mapping_927_dict[ingene.replace('.1-p1','')]:
            #print(n)
    print (desc_927[ingene])
    print(list(temp[temp['Protein IDs'].str.contains(ingene)]['Protein IDs']))
    print('unique peptides:',
          list(temp[temp['Protein IDs'].str.contains(ingene)]['Unique peptides']))
    list(temp[temp['Protein IDs'].str.contains(ingene)]['Peptide counts (all)'])
In [ ]:
 

Tb927.5.4450

In [62]:
plot_bar(ingene='Tb927.5.4450',log=False,title='Tb927.5.4450')
hypothetical protein, conserved
['Tb927.5.4450:mRNA-p1;Tb05.5K5.100:mRNA-p1']
unique peptides: [8]

ProtRank analysis

In [63]:
import ProtRank
In [64]:
data =ProtRank.load_data(os.path.join(OUT_FOLDER,'indata_prank.txt'), 
          separator = '\t',
          ignore_cols = [], index_col = 0, comments = '#')
data.head()
loaded input data with 5741 proteins that have been measured under 9 different conditions
Out[64]:
WT1 WT2 WT3 Plus1 Plus2 Plus3 Minus1 Minus2 Minus3
Gene_id
Tb927.5.4450 138240.0 157210.0 137240.0 175120.0 173400.0 176710.0 168770.0 178260.0 166710.0
Tb927.5.4460 1083700.0 1218100.0 1102700.0 1344500.0 1334100.0 1318900.0 1424000.0 1467700.0 1358100.0
Tb927.5.4470 540130.0 604110.0 556550.0 586730.0 560580.0 571870.0 557320.0 588210.0 540260.0
Tb927.5.4480 1065400.0 1189700.0 1100500.0 1244600.0 1257100.0 1226500.0 1233200.0 1259100.0 1176500.0
Tb927.5.4500 441310.0 505710.0 445210.0 535490.0 522490.0 565460.0 553280.0 575600.0 544740.0
In [65]:
#ProtRank.save_results??
In [66]:
OUT_FOLDER
Out[66]:
'combined/txt/results'

Test 1

In [72]:
what_to_compare = [[['WT1', 'Plus1'], ['WT2', 'Plus2'], ['WT3', 'Plus3']]]
ProtRank.data_stats(data, what_to_compare = what_to_compare, ignore_missed = True)
description = 'wt_vs_plus_sample_dataset'
significant_proteins = ProtRank.rank_proteins(
    data, what_to_compare, description, path_to=OUT_FOLDER)
input data contain results for 5741 proteins and 9 different conditions
list of measured conditions: WT1, WT2, WT3, Plus1, Plus2, Plus3, Minus1, Minus2, Minus3
in the data, 0.1% of all counts are zeros

basic statistics for the subset of the data corresponding to the provided comparisons:
6 comparisons provided: [[['WT1', 'Plus1'], ['WT2', 'Plus2'], ['WT3', 'Plus3']]]
after ignoring 3 rows with only zero counts, 5738 rows remain
in the analyzed data, 0.0% of all counts are zeros
median count is 1.89e+05 (computed over non-zero entries only)
ratio between the largest and the smallest non-zero count is 1.71e+05
ratio between the 90th and the 10th percentile non-zero count is 2.04e+01
statistics of irregular missing values:
  in total, there are 0 comparisons involving a zero and a non-zero value (0.0% of all)
  out of 17214 comparisons, 0 involve a zero value and a non-zero exceeding 1.0 * median (0.0% of all)
  (the smaller the fraction, the smaller the problem with irregular zeros in the data)

there are 1 groups of comparisons provided, they contain the following number of pairs: [3]
in total, 3 comparisons will be included in the analysis
for the given comparisons, 5738 rows with at least one nonzero count remain
100 bootstrap realizations: 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 
wt_vs_plus_sample_dataset: there are 950 proteins that have FDR 0.1 or less

Test 2

In [73]:
what_to_compare = [[['Plus1', 'Minus1'], ['Plus2', 'Minus2'], ['Plus3', 'Minus3']]]
description = 'plus_vs_minus_sample_dataset'
significant_proteins_r = ProtRank.rank_proteins(
    data, what_to_compare, description, path_to=OUT_FOLDER)
there are 1 groups of comparisons provided, they contain the following number of pairs: [3]
in total, 3 comparisons will be included in the analysis
for the given comparisons, 5738 rows with at least one nonzero count remain
100 bootstrap realizations: 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 
plus_vs_minus_sample_dataset: there are 755 proteins that have FDR 0.1 or less
In [74]:
common = set(significant_proteins) & set(significant_proteins_r)
to_viz = [n for n in significant_proteins if n in common]
print(len(to_viz))
202

Test 3

In [78]:
what_to_compare = [[['WT1', 'Minus1'], ['WT2', 'Minus2'], ['WT3', 'Minus3']]]
description = 'wt_vs_minus_sample_dataset'
significant_proteins_r = ProtRank.rank_proteins(
    data, what_to_compare, description, path_to=OUT_FOLDER)
there are 1 groups of comparisons provided, they contain the following number of pairs: [3]
in total, 3 comparisons will be included in the analysis
for the given comparisons, 5738 rows with at least one nonzero count remain
100 bootstrap realizations: 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 
wt_vs_minus_sample_dataset: there are 1021 proteins that have FDR 0.1 or less
In [79]:
def parse_prank_out(in_folder = OUT_FOLDER, infile='prs-con1_vs_ind1_sample_dataset.dat',suffix = 'a'):
    df = pd.read_csv(os.path.join(in_folder, infile),
                       sep='\t', comment='#', index_col=[1],
                       names=['id','rank','FDR','sign'])
    df.columns = [n+'_'+suffix for n in df.columns]
    df['log_FDR_'+suffix]=-np.log10(df['FDR'+'_'+suffix])
    df['log_rank_'+suffix]=np.log10(df['rank'+'_'+suffix])
    df['srank_'+suffix]=[n*1 if a=='+' else n*-1 for n,a in 
                         zip(df['rank'+'_'+suffix],df['sign'+'_'+suffix])]
    return df
In [80]:
df_1 = parse_prank_out(infile='prs-plus_vs_minus_sample_dataset.dat',suffix = 'a')
df_2 = parse_prank_out(infile='prs-wt_vs_plus_sample_dataset.dat',suffix = 'b')
df_3 = parse_prank_out(infile='prs-wt_vs_minus_sample_dataset.dat',suffix = 'c')

print(df_1.shape,df_2.shape,df_3.shape)
merge = df_1.join(df_2).join(df_3)
print(merge.shape)
(5738, 7) (5738, 7) (5738, 7)
(5738, 21)
In [81]:
merge.sort_values('srank_a').head()
Out[81]:
id_a rank_a FDR_a sign_a log_FDR_a log_rank_a srank_a id_b rank_b FDR_b ... log_FDR_b log_rank_b srank_b id_c rank_c FDR_c sign_c log_FDR_c log_rank_c srank_c
Tb927.9.3600 1 816.848946 0.0 - inf 2.912142 -816.848946 4 517.290218 0.00000 ... inf 2.713734 517.290218 485 28.964756 0.02806 - 1.551912 1.461870 -28.964756
Tb927.10.1870 3 561.369581 0.0 - inf 2.749249 -561.369581 3690 1.249182 0.58634 ... 0.231850 0.096626 -1.249182 4 493.997633 0.00000 - inf 2.693725 -493.997633
Tb927.11.8040 4 395.684030 0.0 - inf 2.597349 -395.684030 2875 2.256478 0.43828 ... 0.358248 0.353431 2.256478 19 249.202201 0.00000 - inf 2.396552 -249.202201
Tb927.11.870 6 388.526507 0.0 - inf 2.589421 -388.526507 4985 0.521379 0.78097 ... 0.107366 -0.282846 -0.521379 9 335.702623 0.00000 - inf 2.525955 -335.702623
Tb927.6.4080 8 333.369068 0.0 - inf 2.522925 -333.369068 3604 1.319005 0.57442 ... 0.240770 0.120246 1.319005 20 248.160203 0.00000 - inf 2.394732 -248.160203

5 rows × 21 columns

In [82]:
data.head()
Out[82]:
WT1 WT2 WT3 Plus1 Plus2 Plus3 Minus1 Minus2 Minus3
Gene_id
Tb927.5.4450 138240.0 157210.0 137240.0 175120.0 173400.0 176710.0 168770.0 178260.0 166710.0
Tb927.5.4460 1083700.0 1218100.0 1102700.0 1344500.0 1334100.0 1318900.0 1424000.0 1467700.0 1358100.0
Tb927.5.4470 540130.0 604110.0 556550.0 586730.0 560580.0 571870.0 557320.0 588210.0 540260.0
Tb927.5.4480 1065400.0 1189700.0 1100500.0 1244600.0 1257100.0 1226500.0 1233200.0 1259100.0 1176500.0
Tb927.5.4500 441310.0 505710.0 445210.0 535490.0 522490.0 565460.0 553280.0 575600.0 544740.0
In [83]:
data.columns
Out[83]:
Index(['WT1', 'WT2', 'WT3', 'Plus1', 'Plus2', 'Plus3', 'Minus1', 'Minus2', 'Minus3'], dtype='object')
In [84]:
print(merge.shape)
merge = merge.join(data)
print(merge.shape)
(5738, 21)
(5738, 30)
In [86]:
merge['log2FC_a'] = np.log2(merge[['Plus1', 'Plus2', 'Plus3']].mean(axis=1)
                            /merge[['Minus1', 'Minus2', 'Minus3']].mean(axis=1))


merge['log2FC_b'] = np.log2(merge[['WT1', 'WT2', 'WT3']].mean(axis=1)
                            /  merge[['Plus1', 'Plus2', 'Plus3']].mean(axis=1))

merge['log2FC_c'] = np.log2(merge[['WT1', 'WT2', 'WT3']].mean(axis=1)
                            /  merge[['Minus1', 'Minus2', 'Minus3']].mean(axis=1))
In [87]:
merge['log10intensity'] = np.log10(merge[data.columns].mean(axis=1))
merge['desc']=[desc_927[n] for n in merge.index.values]
In [88]:
df.head()
Out[88]:
Protein IDs Majority protein IDs Peptide counts (all) Peptide counts (razor+unique) Peptide counts (unique) Fasta headers Number of proteins Peptides Razor + unique peptides Unique peptides ... Evidence IDs MS/MS IDs Best MS/MS Oxidation (M) site IDs Oxidation (M) site positions Taxonomy IDs unique_int Gene_id desc pep_stats
194 Tb927.5.4450:mRNA-p1;Tb05.5K5.100:mRNA-p1 Tb927.5.4450:mRNA-p1;Tb05.5K5.100:mRNA-p1 8;8 8;8 8;8 Tb927.5.4450:mRNA-p1 | transcript=Tb927.5.4450... 2 8 8 8 ... 18272;19002;32127;53582;71692;71693;75682;8186... 19732;20499;34248;57240;76559;76560;80799;8728... 19732;20499;34248;57240;76559;80799;87287;108592 NaN NaN -1;-1 8 Tb927.5.4450 hypothetical protein, conserved 8
195 Tb927.5.4460:mRNA-p1;Tb05.5K5.110:mRNA-p1;Tb11... Tb927.5.4460:mRNA-p1;Tb05.5K5.110:mRNA-p1;Tb11... 45;45;25;1 45;45;25;1 45;45;25;1 Tb927.5.4460:mRNA-p1 | transcript=Tb927.5.4460... 4 45 45 45 ... 3875;3876;3877;3878;4848;4849;4850;4851;4852;4... 4188;4189;4190;4191;5264;5265;5266;5267;5268;5... 4189;5265;5269;22529;23593;23693;30683;32004;3... 102;103;104;105 251;389;481;828 -1;-1;-1;-1 45 Tb927.5.4460 major vault protein, putative 45
196 Tb927.5.4470:mRNA-p1;Tb05.5K5.120:mRNA-p1 Tb927.5.4470:mRNA-p1;Tb05.5K5.120:mRNA-p1 26;26 26;26 26;26 Tb927.5.4470:mRNA-p1 | transcript=Tb927.5.4470... 2 26 26 26 ... 256;2979;6506;6507;6508;8795;8796;8797;14467;2... 281;3255;7123;7124;7125;9608;9609;9610;15733;3... 281;3255;7123;9609;15733;31067;31866;39199;416... NaN NaN -1;-1 26 Tb927.5.4470 hypothetical protein, conserved 26
197 Tb927.5.4480:mRNA-p1;Tb05.5K5.130:mRNA-p1 Tb927.5.4480:mRNA-p1;Tb05.5K5.130:mRNA-p1 36;36 36;36 36;36 Tb927.5.4480:mRNA-p1 | transcript=Tb927.5.4480... 2 36 36 36 ... 10626;10627;10628;13608;13609;14974;22354;2327... 11546;11547;11548;14745;14746;16255;23998;2498... 11547;11548;14745;16255;23998;24984;28336;3218... 106;107;108;109;110;111 88;207;225;324;349;579 -1;-1 36 Tb927.5.4480 paraflagellar rod component par4, putative 36
198 Tb927.5.4500:mRNA-p1;Tb05.5K5.150:mRNA-p1 Tb927.5.4500:mRNA-p1;Tb05.5K5.150:mRNA-p1 14;14 14;14 14;14 Tb927.5.4500:mRNA-p1 | transcript=Tb927.5.4500... 2 14 14 14 ... 14391;14392;14393;14394;14395;60656;60657;6065... 15649;15650;15651;15652;15653;64795;64796;6479... 15652;64799;72911;74582;74585;103389;104348;12... 112;113;114;115 71;112;146;171 -1;-1 14 Tb927.5.4500 ras-like small GTPase, putative 14

5 rows × 67 columns

In [89]:
desc_dict=dict(zip(df['Gene_id'],df['desc']))
desc_dict['Tb07.11L3.100']
Out[89]:
'SUMO-interacting motif-containing protein'
In [ ]:
 
In [90]:
merge.reset_index().merge(
    df[['Protein IDs','Fasta headers','Gene_id']],
           left_on='index',right_on='Gene_id').to_csv(os.path.join(OUT_FOLDER,'out.csv'))

Web-app dataset

In [91]:
temp = merge.reset_index()
temp = temp.rename({
    'index':'Gene_id',
    'srank_a':'Signed_Rank',
    'log10intensity':'log10_TMT_intensity',
    'FDR_a':'FDR',
    'desc':'Desc'},axis=1)
temp['Gene_acc']=temp.index.values
temp[['Gene_acc','Gene_id','Signed_Rank','log10_TMT_intensity','FDR','Desc']].to_csv(
    'indata_P_M.csv',index=False)
temp[['Gene_acc','Gene_id','Plus1','Plus2','Plus3',
      'Minus1','Minus2','Minus3','Desc']].to_csv('indata2_P_M.csv',index=False)
    
In [93]:
temp = merge.reset_index()
temp = temp.rename({
    'index':'Gene_id',
    'srank_b':'Signed_Rank',
    'log10intensity':'log10_TMT_intensity',
    'FDR_b':'FDR',
    'desc':'Desc'},axis=1)
temp['Gene_acc']=temp.index.values
temp[['Gene_acc','Gene_id','Signed_Rank','log10_TMT_intensity','FDR','Desc']].to_csv(
    'indata_D_P.csv',index=False)
temp[['Gene_acc','Gene_id','WT1','WT2','WT3',
      'Plus1','Plus2','Plus3','Desc']].to_csv('indata2_D_P.csv',index=False)
    
In [94]:
temp = merge.reset_index()
temp = temp.rename({
    'index':'Gene_id',
    'srank_c':'Signed_Rank',
    'log10intensity':'log10_TMT_intensity',
    'FDR_c':'FDR',
    'desc':'Desc'},axis=1)
temp['Gene_acc']=temp.index.values
temp[['Gene_acc','Gene_id','Signed_Rank','log10_TMT_intensity','FDR','Desc']].to_csv(
    'indata_D_M.csv',index=False)
temp[['Gene_acc','Gene_id','WT1','WT2','WT3',
      'Minus1','Minus2','Minus3','Desc']].to_csv('indata2_D_M.csv',index=False)
    
In [95]:
temp = merge.reset_index()
temp = temp.rename({
    'index':'Gene_id',
    'log2FC_a':'Signed_Rank_Plus_vs_Minus',
    'log2FC_b':'Signed_Rank_DM_vs_Plus',
    'FDR_a':'FDR_Plus_vs_Minus',
    'FDR_b':'FDR_DM_vs_Plus',
    'desc':'Desc'},axis=1)
temp['Gene_acc']=temp.index.values

temp[['Gene_acc','Gene_id','Signed_Rank_DM_vs_Plus',
      'FDR_DM_vs_Plus','Signed_Rank_Plus_vs_Minus',
      'FDR_Plus_vs_Minus','Desc']].to_csv('indata_all.csv',index=False)

temp[['Gene_acc','Gene_id','WT1','WT2','WT3',
      'Plus1','Plus2','Plus3',
      'Minus1','Minus2','Minus3','Desc']].to_csv('indata2_all.csv',index=False)
    

Volcano Plot

In [96]:
fig,axes=plt.subplots(figsize=(12,6), ncols=2, nrows=1)
selection = merge[merge['FDR_a']<0.01]
selection_index = selection.index.values
selection_names = [desc_dict.get(prot_id,'none') for prot_id in selection_index]
PTUT.make_vulcano(merge, axes[0], x='log2FC_a', y='FDR_a', 
             annot_index=selection_index,
             annot_names = selection_names,
             title='Volcano',
                  label_for_selection='Regulated',
                 point_size_all=5,alpha_main=0.2,
                  add_text=False,point_size_selection=8)

PTUT.make_vulcano(merge, axes[1], x='log10intensity', y='log2FC_a', 
             annot_index=selection_index,
             annot_names = selection_names,
             title='MA',
                  label_for_selection='Regulated',
                 point_size_all=5,alpha_main=0.2,
                 point_size_selection=8,add_text=False)
axes[1].get_legend().remove()
axes[0].get_legend().remove()
axes[0].invert_yaxis()
plt.suptitle('Plus_vs_Minus')
plt.show()
no selection
no selection
In [97]:
fig,axes=plt.subplots(figsize=(12,6), ncols=2, nrows=1)
selection = merge[merge['FDR_b']<0.01]
selection_index = selection.index.values
selection_names = [desc_dict.get(prot_id,'none') for prot_id in selection_index]
PTUT.make_vulcano(merge, axes[0], x='log2FC_b', y='FDR_b', 
             annot_index=selection_index,
             annot_names = selection_names,
             title='Volcano',
                  label_for_selection='Regulated',
                 point_size_all=5,alpha_main=0.2,
                  add_text=False,point_size_selection=8)

PTUT.make_vulcano(merge, axes[1], x='log10intensity', y='log2FC_b', 
             annot_index=selection_index,
             annot_names = selection_names,
             title='MA',
                  label_for_selection='Regulated',
                 point_size_all=5,alpha_main=0.2,
                 point_size_selection=8,add_text=False)
axes[1].get_legend().remove()
axes[0].get_legend().remove()
axes[0].invert_yaxis()
plt.suptitle('WT_vs_Plus')
plt.show()
no selection
no selection
In [98]:
fig,axes=plt.subplots(figsize=(12,6), ncols=2, nrows=1)
selection = merge[merge['FDR_c']<0.01]
selection_index = selection.index.values
selection_names = [desc_dict.get(prot_id,'none') for prot_id in selection_index]
PTUT.make_vulcano(merge, axes[0], x='log2FC_c', y='FDR_c', 
             annot_index=selection_index,
             annot_names = selection_names,
             title='Volcano',
                  label_for_selection='Regulated',
                 point_size_all=5,alpha_main=0.2,
                  add_text=False,point_size_selection=8)

PTUT.make_vulcano(merge, axes[1], x='log10intensity', y='log2FC_c', 
             annot_index=selection_index,
             annot_names = selection_names,
             title='MA',
                  label_for_selection='Regulated',
                 point_size_all=5,alpha_main=0.2,
                 point_size_selection=8,add_text=False)
axes[1].get_legend().remove()
axes[0].get_legend().remove()
axes[0].invert_yaxis()
plt.suptitle('WT_vs_Plus')
no selection
no selection
Out[98]:
Text(0.5, 0.98, 'WT_vs_Plus')

Volcano Plot

In [101]:
common_regulated = merge[
    #(merge['srank_b']>0)&(merge['srank_a']>0)&
    (merge['FDR_b']<0.01)&(merge['FDR_a']<0.01)
]
common_regulated.shape
Out[101]:
(22, 35)
In [102]:
merge['regulated']=['yes' if n in common_regulated.index.values else 'no' for n in merge.index.values]

common_regulated[['srank_a','FDR_a','srank_b','FDR_b','desc']]
Out[102]:
srank_a FDR_a srank_b FDR_b desc
Tb927.9.3600 -816.848946 0.00000 517.290218 0.00000 Glycosyl transferase family 11, putative
Tb927.11.9790 391.781007 0.00000 -183.586077 0.00000 calmodulin, putative
Tb927.11.9860 285.397879 0.00000 -57.925303 0.00835 EF-hand domain pair, putative
Tb927.4.3430 278.446048 0.00000 -55.640874 0.00955 Mitochondrial import inner membrane translocas...
Tb927.3.2180 -257.798185 0.00000 -110.242776 0.00205 Mitochondrial ATP synthase subunit, putative
Tb927.9.1610 227.458117 0.00000 -82.884741 0.00306 CDP-diacylglycerol--inositol 3-phosphatidyltra...
Tb927.11.1330 210.580113 0.00000 -71.236032 0.00532 Stress responsive A/B Barrel Domain, putative
Tb927.10.3380 189.256815 0.00000 -135.027585 0.00136 60S acidic ribosomal protein P2, putative
Tb927.11.5660 187.086118 0.00000 -267.322954 0.00000 hypothetical protein, conserved
Tb927.10.9025 160.455784 0.00000 100.427246 0.00250 hypothetical protein
Tb927.7.840 -157.423935 0.00000 -64.145574 0.00678 Mitochondrial ATP synthase subunit, putative
Tb927.9.3290 130.259603 0.00022 -54.387345 0.00981 4F5 protein family, putative
Tb927.2.6220 119.794435 0.00115 202.721274 0.00000 adenosine transporter 2, putative
Tb927.3.2880 -118.746835 0.00127 -78.918860 0.00365 Mitochondrial ATP synthase subunit, putative
Tb927.5.3090 -103.550136 0.00212 -72.837061 0.00484 Mitochondrial ATP synthase subunit, putative
Tb927.7.4700 102.279396 0.00221 169.609145 0.00050 WD domain, G-beta repeat, putative
Tb927.8.6090 88.716144 0.00404 -94.139005 0.00268 ubiquitin-protein ligase, putative
Tb427.BES122.9 72.174190 0.00580 -146.337732 0.00050 none
Tb11.v5.0905 -69.032665 0.00660 95.396614 0.00268 hypothetical protein, conserved
Tb927.5.370 -62.341645 0.00814 398.114787 0.00000 75 kDa invariant surface glycoprotein, putative
Tb927.11.1270 -62.102216 0.00814 -54.489925 0.00981 Mitochondrial ATP synthase subunit, putative
Tb927.10.15410 59.616475 0.00863 -118.591362 0.00164 glycosomal malate dehydrogenase
In [112]:
#for item in list(desc_dict.keys()):
#    print(item.split('.')[0])

plt.style.use('ggplot')

fig,ax=plt.subplots(figsize=(12,8))
merge.plot(kind='scatter', x='log2FC_a', y='log2FC_b',
           ax=ax,marker='.',alpha=0.2,label='MS Identified')
merge.loc[
    common_regulated.index.values
].plot(kind='scatter',x='log2FC_a',y='log2FC_b',ax=ax,c='r',label='Significant')
ax.set_xlabel('DOWN  <-- Plus/Minus --> UP',fontsize=12)
ax.set_ylabel('Down  <-- WT/Plus --> UP',fontsize=12)


ax.legend(loc='center left', bbox_to_anchor=(1, 0.5),title= 'T. brucei proteins')
leg = ax.get_legend()
leg.legendHandles[0].set_color('Blue')
leg.legendHandles[0].set_alpha(1)
 #none}
    
texts = [ax.text(merge.loc[i]['log2FC_a'], 
                       merge.loc[i]['log2FC_b'],
                       desc_dict.get(i,'none'))
                       for i in common_regulated.index.values]

adjust_text(texts, arrowprops=dict(arrowstyle='->',
                                   color='red'),ax=ax)

plt.title('Common')
plt.tight_layout()
plt.savefig(os.path.join(OUT_FOLDER,'Fig1.png'))
plt.savefig(os.path.join(OUT_FOLDER,'Fig1.svg'))
plt.show()
In [105]:
#'Tb927.11.13140',
look_for = ['Tb927.11.17000','Tb927.9.6090','Tb927.8.7700']
#for item in list(desc_dict.keys()):
#    print(item.split('.')[0])

plt.style.use('ggplot')

fig,ax=plt.subplots(figsize=(12,8))
merge.plot(kind='scatter', x='log2FC_a', y='log2FC_b',
           ax=ax,marker='.',alpha=0.2,label='MS Identified')
merge.loc[
    look_for
].plot(kind='scatter',x='log2FC_a',y='log2FC_b',ax=ax,c='r',label='Significant')
ax.set_xlabel('DOWN  <-- Plus/Minus --> UP',fontsize=12)
ax.set_ylabel('Down  <-- DM/Plus --> UP',fontsize=12)


ax.legend(loc='center left', bbox_to_anchor=(1, 0.5),title= 'T. brucei proteins')
leg = ax.get_legend()
leg.legendHandles[0].set_color('Blue')
leg.legendHandles[0].set_alpha(1)
 #none}
    
texts = [ax.text(merge.loc[i]['log2FC_a'], 
                       merge.loc[i]['log2FC_b'],
                       desc_dict.get(i,'none'))
                       for i in look_for]

adjust_text(texts, arrowprops=dict(arrowstyle='->',
                                   color='red'),ax=ax)

plt.title('Common')
plt.tight_layout()
plt.savefig(os.path.join(OUT_FOLDER,'Fig2.png'))
plt.savefig(os.path.join(OUT_FOLDER,'Fig2.svg'))
plt.show()
In [107]:
look_for = ['Tb927.11.17000','Tb927.9.6090','Tb927.8.7700']
#for item in list(desc_dict.keys()):
#    print(item.split('.')[0])

plt.style.use('ggplot')

fig,ax=plt.subplots(figsize=(12,8))
merge.plot(kind='scatter', x='FDR_a', y='FDR_b',
           ax=ax,marker='.',alpha=0.2,label='MS Identified')
merge.loc[
    look_for
].plot(kind='scatter',x='FDR_a',y='FDR_b',ax=ax,c='r',label='Significant')
ax.set_xlabel('FDR_a',fontsize=12)
ax.set_ylabel('FDR_b',fontsize=12)


ax.legend(loc='center left', bbox_to_anchor=(1, 0.5),title= 'T. brucei proteins')
leg = ax.get_legend()
leg.legendHandles[0].set_color('Blue')
leg.legendHandles[0].set_alpha(1)
 #none}
    
texts = [ax.text(merge.loc[i]['FDR_a'], 
                       merge.loc[i]['FDR_b'],
                       desc_dict.get(i,'none'))
                       for i in look_for]

adjust_text(texts, arrowprops=dict(arrowstyle='->',
                                   color='red'),ax=ax)

plt.title('Common')
plt.tight_layout()
plt.savefig(os.path.join(OUT_FOLDER,'Fig4.png'))
plt.savefig(os.path.join(OUT_FOLDER,'Fig4.svg'))
plt.show()
In [ ]:
 
In [108]:
import plotly.express as px
In [109]:
merge.columns
Out[109]:
Index(['id_a', 'rank_a', 'FDR_a', 'sign_a', 'log_FDR_a', 'log_rank_a', 'srank_a', 'id_b', 'rank_b', 'FDR_b', 'sign_b', 'log_FDR_b', 'log_rank_b', 'srank_b',
       'id_c', 'rank_c', 'FDR_c', 'sign_c', 'log_FDR_c', 'log_rank_c', 'srank_c', 'WT1', 'WT2', 'WT3', 'Plus1', 'Plus2', 'Plus3', 'Minus1', 'Minus2', 'Minus3',
       'log2FC_a', 'log2FC_b', 'log2FC_c', 'log10intensity', 'desc', 'regulated'],
      dtype='object')
In [110]:
import plotly.express as px
df = px.data.iris() # iris is a pandas DataFrame
fig = px.scatter(merge, x="log2FC_a", y="log2FC_b",
                 color="regulated",hover_name=merge.index.values,
                hover_data=['FDR_a','FDR_b','desc'])
fig.write_html('scatter.html', auto_open=False)
fig.show()

Proteins Bar Plots

In [111]:
for prot in list(common_regulated.index.values):
    plot_bar(ingene=prot,log=False,title=prot+' '+desc_dict.get(prot,'none'))
Glycosyl transferase family 11, putative
['Tb927.9.3600:mRNA-p1']
unique peptides: [4]
calmodulin, putative
['Tb927.11.9790:mRNA-p1']
unique peptides: [6]
EF-hand domain pair, putative
['Tb927.11.9860:mRNA-p1']
unique peptides: [7]
Mitochondrial import inner membrane translocase subunit TIM12
['Tb927.4.3430:mRNA-p1']
unique peptides: [2]
Mitochondrial ATP synthase subunit, putative
['Tb927.3.2180:mRNA-p1']
unique peptides: [3]
CDP-diacylglycerol--inositol 3-phosphatidyltransferase, putative
['Tb927.9.1610:mRNA-p1']
unique peptides: [3]
Stress responsive A/B Barrel Domain, putative
['Tb927.11.1330:mRNA-p1', 'Tb927.11.13300:mRNA-p1']
unique peptides: [4, 5]
60S acidic ribosomal protein P2, putative
['Tb927.10.3380:mRNA-p1;Tb927.10.3370:mRNA-p1']
unique peptides: [5]
hypothetical protein, conserved
['Tb927.11.5660:mRNA-p1']
unique peptides: [2]
hypothetical protein
['Tb927.10.9025:mRNA-p1']
unique peptides: [4]
Mitochondrial ATP synthase subunit, putative
['Tb927.7.840:mRNA-p1']
unique peptides: [4]
4F5 protein family, putative
['Tb927.9.3290:mRNA-p1']
unique peptides: [3]
adenosine transporter 2, putative
['Tb927.2.6220:mRNA-p1']
unique peptides: [5]
Mitochondrial ATP synthase subunit, putative
['Tb927.3.2880:mRNA-p1']
unique peptides: [4]
Mitochondrial ATP synthase subunit, putative
['Tb927.5.3090:mRNA-p1']
unique peptides: [7]
WD domain, G-beta repeat, putative
['Tb927.7.4700:mRNA-p1']
unique peptides: [1]
ubiquitin-protein ligase, putative
['Tb927.8.6090:mRNA-p1']
unique peptides: [3]
none
['Tb427.BES122.9;Tb927.5.4610:pseudogenic_transcript-p1']
unique peptides: [1]
hypothetical protein, conserved
['Tb11.v5.0905.1-p1']
unique peptides: [0]
75 kDa invariant surface glycoprotein, putative
['Tb927.5.370:mRNA-p1;Tb11.v5.0230.1-p1', 'Tb927.5.3700:mRNA-p1']
unique peptides: [4, 4]
Mitochondrial ATP synthase subunit, putative
['Tb927.11.1270:mRNA-p1']
unique peptides: [8]
glycosomal malate dehydrogenase
['Tb927.10.15410:mRNA-p1']
unique peptides: [6]
In [113]:
!jupyter nbconvert --to html_toc analysis.ipynb
[NbConvertApp] Converting notebook analysis.ipynb to html_toc
[NbConvertApp] Support files will be in analysis_files/
[NbConvertApp] Making directory analysis_files
[NbConvertApp] Making directory analysis_files
[NbConvertApp] Making directory analysis_files
[NbConvertApp] Making directory analysis_files
[NbConvertApp] Making directory analysis_files
[NbConvertApp] Making directory analysis_files
[NbConvertApp] Making directory analysis_files
[NbConvertApp] Making directory analysis_files
[NbConvertApp] Making directory analysis_files
[NbConvertApp] Making directory analysis_files
[NbConvertApp] Making directory analysis_files
[NbConvertApp] Making directory analysis_files
[NbConvertApp] Making directory analysis_files
[NbConvertApp] Making directory analysis_files
[NbConvertApp] Making directory analysis_files
[NbConvertApp] Making directory analysis_files
[NbConvertApp] Making directory analysis_files
[NbConvertApp] Making directory analysis_files
[NbConvertApp] Making directory analysis_files
[NbConvertApp] Making directory analysis_files
[NbConvertApp] Making directory analysis_files
[NbConvertApp] Making directory analysis_files
[NbConvertApp] Making directory analysis_files
[NbConvertApp] Making directory analysis_files
[NbConvertApp] Making directory analysis_files
[NbConvertApp] Making directory analysis_files
[NbConvertApp] Making directory analysis_files
[NbConvertApp] Making directory analysis_files
[NbConvertApp] Making directory analysis_files
[NbConvertApp] Making directory analysis_files
[NbConvertApp] Making directory analysis_files
[NbConvertApp] Making directory analysis_files
[NbConvertApp] Making directory analysis_files
[NbConvertApp] Making directory analysis_files
[NbConvertApp] Making directory analysis_files
[NbConvertApp] Making directory analysis_files
[NbConvertApp] Making directory analysis_files
[NbConvertApp] Making directory analysis_files
[NbConvertApp] Making directory analysis_files
[NbConvertApp] Making directory analysis_files
[NbConvertApp] Making directory analysis_files
[NbConvertApp] Making directory analysis_files
[NbConvertApp] Making directory analysis_files
[NbConvertApp] Writing 4705330 bytes to analysis.html
In [ ]:
 
In [ ]: